In [1]:
import numpy as np
import pandas as pd

In [6]:
def transform(points_x):
    n = points_x.shape[0]
    m = 8
    points_z = np.zeros([n, m])
    points_z[:, 0] = 1                                        # intercept
    points_z[:, 1:3] = points_x[:, 0:2]                       # original features x1, x2
    points_z[:, 3:5] = np.power(points_x[:, 0:2], 2)          # squares x1^2, x2^2
    points_z[:, 5] = points_x[:, 0] * points_x[:, 1]          # product x1 * x2
    points_z[:, 6] = np.abs(points_x[:, 0] - points_x[:, 1])  # |x1 - x2|
    points_z[:, 7] = np.abs(points_x[:, 0] + points_x[:, 1])  # |x1 + x2|
    return points_z
    
def solve_linear_regression(pi, labels):
    # closed-form least squares: w = (Z^T Z)^+ Z^T y
    return np.dot(np.dot(np.linalg.pinv(np.dot(pi.T, pi)), pi.T), labels)

def get_linear_regression_error(pi, labels, gx_vector):
    # fraction of points whose predicted sign disagrees with the label
    predictions = np.sign(np.dot(pi, gx_vector))
    return np.mean(labels * predictions < 0)
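
As an aside, the pseudo-inverse solution above gives, up to floating-point error, the same least-squares fit that np.linalg.lstsq computes directly; a minimal sketch of the cross-check, assuming train_z and train_y have been built as in the cells below:

w_normal_eq = solve_linear_regression(train_z, train_y)
w_lstsq, *_ = np.linalg.lstsq(train_z, train_y, rcond=None)  # minimizes the same squared error on the transformed features
assert np.allclose(w_normal_eq, w_lstsq)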

Data preparation


In [7]:
train = pd.read_csv("./data/in.dta", sep=r"\s+", header=None).to_numpy()
test = pd.read_csv("./data/out.dta", sep=r"\s+", header=None).to_numpy()

In [8]:
train_x = train[:,:2]
train_y = train[:,2]

test_x = test[:,:2]
test_y = test[:,2]

In [16]:
train_z = transform(train_x)
test_z = transform(test_x)
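
A quick sanity check on the transformed matrices (assuming in.dta holds the 35 points implied by the 25/10 split used below):

print(train_z.shape)  # expected: (35, 8)
print(test_z.shape)   # (number of out-of-sample points, 8)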

Linear regression with cross validation


In [67]:
def argmin(z):
    # return the key with the smallest value (None for an empty dict)
    if not z:
        return None
    return min(z, key=z.get)
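
For example, feeding it the validation errors from the 25/10 run below as a hypothetical input:

argmin({3: 0.3, 4: 0.5, 5: 0.2, 6: 0.0, 7: 0.1})  # returns 6, the key with the smallest value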

In [87]:
def cross_validate_linear_regression(t_size):
    error_val = dict()
    error_out = dict()
    for k in range(3, 8):
        # fit on the first t_size points using the first k+1 transformed features (no weight decay)
        gx_vector = solve_linear_regression(train_z[:t_size, :k+1], train_y[:t_size])
        error_val[k] = get_linear_regression_error(train_z[t_size:, :k+1], train_y[t_size:], gx_vector) # validation error
        error_out[k] = get_linear_regression_error(test_z[:, :k+1], test_y, gx_vector) # out-of-sample error
        print("k = %d\t error_val = %.3f\terror_out = %.3f" % (k, error_val[k], error_out[k]))

    best_k = argmin(error_val)
    print("\nbest validation k = %d\terror_val = %.3f\t error_out = %.3f" % (best_k, error_val[best_k], error_out[best_k]))

    best_k = argmin(error_out)
    print("\nbest out-of-sample k = %d\terror_val = %.3f\t error_out = %.3f" % (best_k, error_val[best_k], error_out[best_k]))

Train = 25 points

Validation = 10 points


In [88]:
cross_validate_linear_regression(t_size = 25)


k = 3	 error_val = 0.300	error_out = 0.420
k = 4	 error_val = 0.500	error_out = 0.416
k = 5	 error_val = 0.200	error_out = 0.188
k = 6	 error_val = 0.000	error_out = 0.084
k = 7	 error_val = 0.100	error_out = 0.072

best validation k = 6	error_val = 0.000	 error_out = 0.084

best out-of-sample k = 7	error_val = 0.100	 error_out = 0.072

Train = 10 points

Validation = 25 points


In [90]:
cross_validate_linear_regression(t_size = 10)


k = 3	 error_val = 0.480	error_out = 0.472
k = 4	 error_val = 0.400	error_out = 0.412
k = 5	 error_val = 0.360	error_out = 0.348
k = 6	 error_val = 0.000	error_out = 0.104
k = 7	 error_val = 0.160	error_out = 0.172

best validation k = 6	error_val = 0.000	 error_out = 0.104

best out-of-sample k = 6	error_val = 0.000	 error_out = 0.104

Validation bias


In [103]:
# estimate E[e1], E[e2] and E[min(e1, e2)] for two independent uniform(0, 1) error estimates
tot_e1 = 0
tot_e2 = 0
tot_e = 0
iterations = 10000
for i in range(iterations):
    e1 = np.random.rand(1)
    e2 = np.random.rand(1)
    tot_e1 += e1
    tot_e2 += e2
    tot_e += min(e1, e2)
    
print(tot_e1 / iterations)
print(tot_e2 / iterations)
print(tot_e / iterations)


[ 0.50102712]
[ 0.50300042]
[ 0.33538324]
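
The simulation agrees with the closed-form value: for independent e1, e2 ~ Uniform(0, 1), P(min(e1, e2) > t) = (1 - t)^2, so E[min(e1, e2)] = ∫_0^1 (1 - t)^2 dt = 1/3, while E[e1] = E[e2] = 1/2. The estimates above (≈ 0.501, 0.503, 0.335) show exactly this validation bias: reporting the smaller of two unbiased error estimates is optimistically biased downward.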